This document compares the data from forest and nursery populations of Phytophthora ramorum. Forest populations are isolated from Curry County, OR between 2001 and 2014. Nursery populations were isolated between 2000 and 2012 from both California and Oregon nurseries. Both populations are combined in a data set called for2nur.
library(PramCurry)
## Loading required package: poppr
## Loading required package: adegenet
## Loading required package: ade4
## ==========================
## adegenet 1.4-2 is loaded
## ==========================
##
## - to start, type '?adegenet'
## - to browse adegenet website, type 'adegenetWeb()'
## - to post questions/comments: adegenet-forum@lists.r-forge.r-project.org
##
##
## This is poppr version 1.1.2.99.70. To get started, type package?poppr
library(poppr)
library(reshape2)
library(ggplot2)
library(ape)
library(igraph)
##
## Attaching package: 'igraph'
##
## The following object is masked from 'package:ape':
##
## edges
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
options(stringsAsFactors = FALSE)
data(ramdat)
data(for2nur)
data(pop_data)
data(myPal)
data(comparePal)
sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dplyr_0.2 igraph_0.7.1 ape_3.1-4.5 ggplot2_1.0.0
## [5] reshape2_1.4 PramCurry_1.0 poppr_1.1.2.99-70 adegenet_1.4-2
## [9] ade4_1.6-2
##
## loaded via a namespace (and not attached):
## [1] assertthat_0.1 bitops_1.0-6 boot_1.3-11
## [4] caTools_1.17.1 colorspace_1.2-4 digest_0.6.4
## [7] evaluate_0.5.5 fastmatch_1.0-4 formatR_1.0
## [10] gdata_2.13.3 ggmap_2.3 grid_3.1.2
## [13] gtable_0.1.2 gtools_3.4.1 htmltools_0.2.6
## [16] httpuv_1.3.0 knitr_1.6 labtools_0.4.1
## [19] lattice_0.20-29 lubridate_1.3.3 mapproj_1.2-2
## [22] maps_2.3-9 MASS_7.3-34 Matrix_1.1-4
## [25] memoise_0.2.1 munsell_0.4.2 nlme_3.1-117
## [28] parallel_3.1.2 pegas_0.6 permute_0.8-3
## [31] phangorn_1.99-7 plotrix_3.5-7 plyr_1.8.1
## [34] png_0.1-7 proto_0.3-10 Rcpp_0.11.2
## [37] RgoogleMaps_1.2.0.6 rjson_0.2.14 RJSONIO_1.3-0
## [40] rmarkdown_0.3.10 scales_0.2.4 shiny_0.10.1
## [43] stringr_0.6.2 tools_3.1.2 vegan_2.0-10
## [46] xtable_1.7-4 yaml_2.1.13
for2nur
##
## This is a genclone object
## -------------------------
## Genotype information:
##
## 98 multilocus genotypes
## 729 diploid individuals
## 5 codominant loci
##
## Population information:
##
## 3 hierarchical levels - SOURCE YEAR STATE
## 9 populations defined - Nursery_CA Nursery_OR JHallCr_OR ...
## Winchuck_OR ChetcoMain_OR PistolRSF_OR
summary(for2nur)
##
## # Total number of genotypes: 729
##
## # Population sample sizes:
## Nursery_CA Nursery_OR JHallCr_OR NFChetHigh_OR Coast_OR
## 145 71 244 114 34
## HunterCr_OR Winchuck_OR ChetcoMain_OR PistolRSF_OR
## 66 35 16 4
##
## # Number of alleles per locus:
## L1 L2 L3 L4 L5
## 2 2 7 3 24
##
## # Number of alleles per population:
## 1 2 3 4 5 6 7 8 9
## 24 25 22 24 19 13 17 15 12
##
## # Percentage of missing data:
## [1] 0
##
## # Observed heterozygosity:
## L1 L2 L3 L4 L5
## 0.9575 0.9588 0.9986 0.9808 0.9890
##
## # Expected heterozygosity:
## L1 L2 L3 L4 L5
## 0.4992 0.4992 0.6186 0.5006 0.8467
knitr::kable(poppr(for2nur, quiet = TRUE))
| Pop | N | MLG | eMLG | SE | H | G | Hexp | E.5 | Ia | rbarD | File |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Nursery_CA | 145 | 30 | 6.524 | 1.2968 | 2.5878 | 7.555 | 0.8737 | 0.5329 | 0.1868 | 0.1983 | for2nur |
| Nursery_OR | 71 | 19 | 6.544 | 1.2210 | 2.4150 | 7.468 | 0.8785 | 0.6348 | 0.0302 | 0.0337 | for2nur |
| JHallCr_OR | 244 | 30 | 4.887 | 1.3345 | 1.9720 | 3.131 | 0.6834 | 0.3446 | 0.3322 | 0.1031 | for2nur |
| NFChetHigh_OR | 114 | 35 | 6.812 | 1.3423 | 2.7354 | 7.071 | 0.8662 | 0.4211 | 0.1426 | 0.0660 | for2nur |
| Coast_OR | 34 | 12 | 5.911 | 1.1693 | 2.0483 | 5.558 | 0.8449 | 0.6748 | 0.2514 | 0.2828 | for2nur |
| HunterCr_OR | 66 | 4 | 1.876 | 0.6904 | 0.4227 | 1.242 | 0.1977 | 0.4596 | -0.0440 | -0.0471 | for2nur |
| Winchuck_OR | 35 | 9 | 4.291 | 1.0722 | 1.4736 | 2.882 | 0.6723 | 0.5594 | -0.0196 | -0.0138 | for2nur |
| ChetcoMain_OR | 16 | 7 | 5.000 | 0.9258 | 1.4500 | 2.844 | 0.6917 | 0.5652 | 0.3748 | 0.3957 | for2nur |
| PistolRSF_OR | 4 | 2 | 2.000 | 0.0000 | 0.5623 | 1.600 | 0.5000 | 0.7949 | 0.0000 | NaN | for2nur |
| Total | 729 | 98 | 7.920 | 1.2245 | 3.4860 | 15.041 | 0.9348 | 0.4436 | 0.2034 | 0.0651 | for2nur |
invisible(genotype_curve(for2nur, sample = 1000, quiet = TRUE))
This analysis will attempt to explain the differentiation between all populations. Here, I am setting the hierarchy to be by Source (one of Nursery, JHallCr, NFChetHigh, Coast, HunterCr, Winchuck, ChetcoMain, PistolRSF) and State (Oregon or California).
Since we want to avoid overfitting the data, we utilize cross-validation, which performs DAPC on 90% of the data and attempts to predict where the 10% that was left out came from. This is doen 1000 times per number of principal components. The number of principal components with the lowest mean squared error and highest mean successful assignment is then used for the final analysis.
setpop(for2nur) <- ~SOURCE/STATE
set.seed(9001)
system.time(xval <- xvalDapc(for2nur@tab, pop(for2nur), n.pca = 5:15,
result = "overall", n.rep = 1000))
## NULL
## user system elapsed
## 235.488 2.424 239.619
xval[-1]
## $`Median and Confidence Interval for Random Chance`
## 2.5% 50% 97.5%
## 0.08994 0.11070 0.13914
##
## $`Mean Successful Assignment by Number of PCs of PCA`
## 5 6 7 8 9 10 11 12 13 14
## 0.6714 0.6809 0.7022 0.7221 0.7217 0.7385 0.7387 0.7368 0.7367 0.7472
## 15
## 0.7579
##
## $`Number of PCs Achieving Highest Mean Success`
## [1] "15"
##
## $`Root Mean Squared Error by Number of PCs of PCA`
## 5 6 7 8 9 10 11 12 13 14
## 0.3300 0.3208 0.2991 0.2792 0.2796 0.2630 0.2627 0.2647 0.2647 0.2542
## 15
## 0.2438
##
## $`Number of PCs Achieving Lowest MSE`
## [1] "15"
##
## $DAPC
## #################################################
## # Discriminant Analysis of Principal Components #
## #################################################
## class: dapc
## $call: dapc.data.frame(x = as.data.frame(x), grp = ..1, n.pca = ..2,
## n.da = ..3)
##
## $n.pca: 15 first PCs of PCA used
## $n.da: 8 discriminant functions saved
## $var (proportion of conserved variance): 0.981
##
## $eig (eigenvalues): 837 202.9 97.36 55.12 43.67 ...
##
## vector length content
## 1 $eig 8 eigenvalues
## 2 $grp 729 prior group assignment
## 3 $prior 9 prior group probabilities
## 4 $assign 729 posterior group assignment
## 5 $pca.cent 38 centring vector of PCA
## 6 $pca.norm 38 scaling vector of PCA
## 7 $pca.eig 33 eigenvalues of PCA
##
## data.frame nrow ncol
## 1 $tab 729 15
## 2 $means 9 15
## 3 $loadings 15 8
## 4 $ind.coord 729 8
## 5 $grp.coord 9 8
## 6 $posterior 729 9
## 7 $pca.loadings 38 15
## 8 $var.contr 38 8
## content
## 1 retained PCs of PCA
## 2 group means
## 3 loadings of variables
## 4 coordinates of individuals (principal components)
## 5 coordinates of groups
## 6 posterior membership probabilities
## 7 PCA loadings of original variables
## 8 contribution of original variables
for2nur.dapc <- dapc(for2nur, n.pca = 20, n.da = 8)
# comparePal <- char2pal(c("JHallCr_OR", "NFChetHigh_OR", "Coast_OR",
# "HunterCr_OR", "Winchuck_OR", "ChetcoMain_OR",
# "PistolRSF_OR"))
# (comparePal <- c(Nursery_CA = "#000000", Nursery_OR = "#808080", comparePal))
par(mfrow = c(1, 2))
scatter(for2nur.dapc, col = comparePal, cex = 2, legend = TRUE, clabel = FALSE,
posi.leg = "bottomleft", scree.pca = TRUE, posi.pca = "topright",
cleg = 0.75, xax = 1, yax = 2, inset.solid = 1)
scatter(for2nur.dapc, col = comparePal, cex = 2, legend = TRUE, clabel = FALSE,
posi.leg = "bottomleft", inset.solid = 1,
cleg = 0.75, xax = 3, yax = 2)
par(mfrow = c(1, 1))
# pdf("fig5scatter.pdf", width = 3.5, height = 3.5, pointsize = 10)
# scatter(for2nur.dapc, col = comparePal, cex = 2, legend = TRUE, clabel = FALSE,
# posi.leg = "bottomleft", scree.pca = TRUE, posi.pca = "topright",
# posi.da = "topleft", cleg = 0.75, xax = 1, yax = 2, inset.solid = 1,
# ratio.pca = 0.2, ratio.da = 0.2)
# dev.off()
summary(for2nur.dapc)
## $n.dim
## [1] 8
##
## $n.pop
## [1] 9
##
## $assign.prop
## [1] 0.7805
##
## $assign.per.pop
## Nursery_CA Nursery_OR JHallCr_OR NFChetHigh_OR Coast_OR
## 0.7379 0.8310 0.8689 0.5439 0.6765
## HunterCr_OR Winchuck_OR ChetcoMain_OR PistolRSF_OR
## 0.9848 0.8571 0.6875 0.0000
##
## $prior.grp.size
##
## Nursery_CA Nursery_OR JHallCr_OR NFChetHigh_OR Coast_OR
## 145 71 244 114 34
## HunterCr_OR Winchuck_OR ChetcoMain_OR PistolRSF_OR
## 66 35 16 4
##
## $post.grp.size
##
## Nursery_CA Nursery_OR JHallCr_OR NFChetHigh_OR Coast_OR
## 118 91 251 71 46
## HunterCr_OR Winchuck_OR ChetcoMain_OR PistolRSF_OR
## 73 50 28 1
The above plots show the first three DA coordinates. DA coordinate 2 is on the y axis. Plot 1 contains DA coordinate 1 whereas plot 2 containd DA coordinate 3. Plot 2 can be thought of as plot 1 rotated about the x axis.
setpop(for2nur) <- ~SOURCE/STATE
plot_posterior(for2nur.dapc, for2nur, pal = comparePal) + theme(axis.text.x = element_text(size = 3))
loadingplot(for2nur.dapc$var.contr, axis = 1)
theLocus <- truenames(seploc(for2nur)[["PrMS43A1"]])$tab
theFreqs <- apply(theLocus, 2, function(e) tapply(e, pop(for2nur), mean,
na.rm = TRUE))
colnames(theFreqs) <- sub("PrMS43A1.", "", colnames(theFreqs))
names(dimnames(theFreqs)) <- c("Region", "Allele")
ggplot(melt(theFreqs),
aes(x = Allele, y = value, color = Region, group = Region)) +
geom_vline(aes(xintercept = Allele), alpha = 0.5, linetype = 3) +
geom_point(aes(size = log(value)), show_guide = FALSE) +
geom_line() +
scale_color_manual(values = comparePal) +
facet_wrap(~Region, ncol = 1) +
geom_text(aes(label = ifelse(value >= 0.05, Allele, NA), y = value + 0.05),
color = "black", position = "jitter") +
scale_x_log10() +
labs(list(x = "Allele Frequency")) +
theme_bw()
## Warning: Removed 20 rows containing missing values (geom_text).
## Warning: Removed 19 rows containing missing values (geom_text).
## Warning: Removed 21 rows containing missing values (geom_text).
## Warning: Removed 19 rows containing missing values (geom_text).
## Warning: Removed 19 rows containing missing values (geom_text).
## Warning: Removed 22 rows containing missing values (geom_text).
## Warning: Removed 21 rows containing missing values (geom_text).
## Warning: Removed 19 rows containing missing values (geom_text).
## Warning: Removed 20 rows containing missing values (geom_text).
setpop(for2nur) <- ~SOURCE/YEAR
invisible(source_year <- mlg.crosspop(for2nur))
## MLG.4: (5 inds) JHallCr_2001 JHallCr_2002 JHallCr_2003 JHallCr_2013
## MLG.5: (3 inds) NFChetHigh_2013 ChetcoMain_2013 Winchuck_2013
## MLG.7: (5 inds) Nursery_2006 Nursery_2008 NFChetHigh_2013
## MLG.12: (6 inds) NFChetHigh_2013 NFChetHigh_2014
## MLG.14: (43 inds) JHallCr_2003 NFChetHigh_2012 JHallCr_2013 NFChetHigh_2013
## NFChetHigh_2014
## MLG.15: (5 inds) NFChetHigh_2013 NFChetHigh_2014
## MLG.16: (3 inds) JHallCr_2001 JHallCr_2003 NFChetHigh_2013
## MLG.17: (17 inds) JHallCr_2002 JHallCr_2003 JHallCr_2004 NFChetHigh_2012
## JHallCr_2013 NFChetHigh_2013
## MLG.18: (8 inds) Winchuck_2012 Winchuck_2013 PistolRSF_2013
## MLG.20: (3 inds) JHallCr_2001 JHallCr_2003 JHallCr_2013
## MLG.21: (13 inds) Nursery_0 Nursery_2007 JHallCr_2001 JHallCr_2003
## NFChetHigh_2012 JHallCr_2013 NFChetHigh_2013
## MLG.22: (149 inds) JHallCr_2001 JHallCr_2002 JHallCr_2003 JHallCr_2004
## Coast_2011 JHallCr_2013 NFChetHigh_2013 ChetcoMain_2013 PistolRSF_2013
## Coast_2013 NFChetHigh_2014
## MLG.23: (23 inds) JHallCr_2002 JHallCr_2003 Winchuck_2012 NFChetHigh_2013
## Winchuck_2013
## MLG.24: (7 inds) JHallCr_2001 JHallCr_2003 Winchuck_2012 NFChetHigh_2013
## ChetcoMain_2013 Winchuck_2013
## MLG.27: (11 inds) Nursery_2008 JHallCr_2001 JHallCr_2003 JHallCr_2013
## MLG.28: (25 inds) Nursery_2007 Nursery_2008 JHallCr_2002 JHallCr_2003
## NFChetHigh_2003 JHallCr_2004 JHallCr_2013 NFChetHigh_2013 ChetcoMain_2014
## JHallCr_2014
## MLG.29: (12 inds) Nursery_2011 JHallCr_2002 JHallCr_2003 JHallCr_2013
## NFChetHigh_2013 Winchuck_2013 JHallCr_2014
## MLG.30: (2 inds) JHallCr_2005 NFChetHigh_2013
## MLG.33: (11 inds) Nursery_0 Nursery_2007 Nursery_2008 Nursery_2005 Coast_2010
## Coast_2014
## MLG.36: (2 inds) NFChetHigh_2013 Winchuck_2013
## MLG.41: (2 inds) JHallCr_2001 JHallCr_2013
## MLG.46: (5 inds) Coast_2006 NFChetHigh_2013 Coast_2014
## MLG.51: (6 inds) Nursery_0 Nursery_2004 Nursery_2012 JHallCr_2003
## MLG.52: (15 inds) Nursery_2011 JHallCr_2004 Coast_2011 NFChetHigh_2013
## Coast_2013 Coast_2014
## MLG.53: (17 inds) Coast_2012 NFChetHigh_2013 ChetcoMain_2013 Coast_2013
## Coast_2014 NFChetHigh_2014
## MLG.54: (13 inds) NFChetHigh_2013 ChetcoMain_2013 Coast_2014
## MLG.55: (2 inds) Nursery_0 HunterCr_2011
## MLG.56: (6 inds) Nursery_0 Nursery_2008 Nursery_2012 Coast_2013
## MLG.57: (3 inds) Nursery_2011 Nursery_2012 ChetcoMain_2013
## MLG.59: (60 inds) Nursery_2007 HunterCr_2011
## MLG.67: (2 inds) JHallCr_2003 JHallCr_2004
## MLG.68: (20 inds) JHallCr_2002 JHallCr_2003 JHallCr_2004 JHallCr_2005
## MLG.69: (2 inds) JHallCr_2002 JHallCr_2003
## MLG.70: (3 inds) JHallCr_2003 JHallCr_2004
## MLG.1011: (34 inds) Nursery_0 Nursery_2007 Nursery_2006 Nursery_2008
## Nursery_2004 Nursery_2005 Nursery_2010 Nursery_2011
## MLG.1013: (6 inds) Nursery_2007 Nursery_2006 Nursery_2008 Nursery_2009
## MLG.1014: (14 inds) Nursery_2007 Nursery_2006 Nursery_2008 Nursery_2010
## Nursery_2011 Nursery_2012
## MLG.1016: (2 inds) Nursery_2007 Nursery_2005
## MLG.1021: (9 inds) Nursery_2008 Nursery_2012
## MLG.1025: (42 inds) Nursery_0 Nursery_2007 Nursery_2006 Nursery_2008
## Nursery_2004 Nursery_2000 Nursery_2010 Nursery_2011 Nursery_2012
## MLG.1026: (27 inds) Nursery_2007 Nursery_2008 Nursery_2010 Nursery_2011
## MLG.1032: (8 inds) Nursery_0 Nursery_2003 Nursery_2010
## MLG.1037: (2 inds) Nursery_0 Nursery_2011
## MLG.1038: (8 inds) Nursery_0 Nursery_2007 Nursery_2011 Nursery_2012
source_year <- source_year[vapply(lapply(source_year, names),
function(z) any(grepl("Nursery", z)), logical(1))]
symlg <- mlgFromString(names(source_year))
invisible(mlg.crosspop(for2nur, mlgsub = symlg[symlg < 1000]))
## MLG.7: (5 inds) Nursery_2006 Nursery_2008 NFChetHigh_2013
## MLG.21: (13 inds) Nursery_0 Nursery_2007 JHallCr_2001 JHallCr_2003
## NFChetHigh_2012 JHallCr_2013 NFChetHigh_2013
## MLG.27: (11 inds) Nursery_2008 JHallCr_2001 JHallCr_2003 JHallCr_2013
## MLG.28: (25 inds) Nursery_2007 Nursery_2008 JHallCr_2002 JHallCr_2003
## NFChetHigh_2003 JHallCr_2004 JHallCr_2013 NFChetHigh_2013 ChetcoMain_2014
## JHallCr_2014
## MLG.29: (12 inds) Nursery_2011 JHallCr_2002 JHallCr_2003 JHallCr_2013
## NFChetHigh_2013 Winchuck_2013 JHallCr_2014
## MLG.33: (11 inds) Nursery_0 Nursery_2007 Nursery_2008 Nursery_2005 Coast_2010
## Coast_2014
## MLG.51: (6 inds) Nursery_0 Nursery_2004 Nursery_2012 JHallCr_2003
## MLG.52: (15 inds) Nursery_2011 JHallCr_2004 Coast_2011 NFChetHigh_2013
## Coast_2013 Coast_2014
## MLG.55: (2 inds) Nursery_0 HunterCr_2011
## MLG.56: (6 inds) Nursery_0 Nursery_2008 Nursery_2012 Coast_2013
## MLG.57: (3 inds) Nursery_2011 Nursery_2012 ChetcoMain_2013
## MLG.59: (60 inds) Nursery_2007 HunterCr_2011
nursery_cross <- unique(sub("_.+?$", "", unlist(lapply(source_year, names))))
pops <- levels(gethierarchy(for2nur, ~SOURCE)$SOURCE)
pops[!pops %in% nursery_cross] # regions with no Nursery isolates.
## [1] "PistolRSF"
setpop(for2nur) <- ~SOURCE/STATE
for2nur
##
## This is a genclone object
## -------------------------
## Genotype information:
##
## 98 multilocus genotypes
## 729 diploid individuals
## 5 codominant loci
##
## Population information:
##
## 3 hierarchical levels - SOURCE YEAR STATE
## 9 populations defined - Nursery_CA Nursery_OR JHallCr_OR ...
## Winchuck_OR ChetcoMain_OR PistolRSF_OR
# nf.msn <- bruvo.msn(for2nur, replen = other(ramdat)$REPLEN, include.ties = TRUE,
# showplot = FALSE)
newReps <- other(ramdat)$REPLEN
(newReps[3] <- 4) # Tetranucleotide repeat
## [1] 4
(newReps <- fix_replen(ramdat, newReps))
## PrMS6A1 Pr9C3A1 PrMS39A1 PrMS45A1 PrMS43A1
## 3 2 4 4 4
nf.msn <- bruvo.msn(for2nur, replen = newReps, include.ties = TRUE,
showplot = FALSE)
degs <- igraph::degree(nf.msn$graph)
names(degs) <- igraph::V(nf.msn$graph)$label
sort(degs)
## MLG.1035 MLG.1016 MLG.1040 MLG.1018 MLG.1019 MLG.1039 MLG.1033 MLG.1001
## 1 1 1 1 1 1 1 1
## MLG.1020 MLG.1036 MLG.26 MLG.19 MLG.65 MLG.64 MLG.58 MLG.43
## 1 1 1 1 1 1 1 1
## MLG.40 MLG.11 MLG.44 MLG.42 MLG.38 MLG.9 MLG.62 MLG.10
## 1 1 1 1 1 1 1 1
## MLG.31 MLG.61 MLG.1 MLG.63 MLG.1021 MLG.1015 MLG.1004 MLG.1012
## 1 1 1 1 2 2 2 2
## MLG.1029 MLG.1017 MLG.1002 MLG.69 MLG.67 MLG.66 MLG.25 MLG.5
## 2 2 2 2 2 2 2 2
## MLG.13 MLG.2 MLG.45 MLG.8 MLG.7 MLG.1013 MLG.1037 MLG.55
## 2 2 2 2 3 3 3 3
## MLG.1038 MLG.1034 MLG.20 MLG.41 MLG.70 MLG.32 MLG.6 MLG.37
## 3 3 3 3 3 3 3 3
## MLG.50 MLG.39 MLG.1032 MLG.51 MLG.59 MLG.56 MLG.57 MLG.4
## 3 3 4 4 4 4 4 4
## MLG.24 MLG.16 MLG.68 MLG.3 MLG.46 MLG.54 MLG.36 MLG.15
## 4 4 4 4 4 4 4 4
## MLG.35 MLG.47 MLG.34 MLG.1025 MLG.21 MLG.1014 MLG.1031 MLG.27
## 4 4 4 5 5 5 5 5
## MLG.1006 MLG.52 MLG.29 MLG.14 MLG.60 MLG.30 MLG.18 MLG.12
## 5 5 5 5 5 5 5 5
## MLG.49 MLG.1011 MLG.33 MLG.23 MLG.53 MLG.48 MLG.1026 MLG.28
## 5 6 6 6 6 6 7 7
## MLG.22 MLG.17
## 7 7
edges_to_remove <- E(nf.msn$graph)[E(nf.msn$graph)$weight >= 0.06]
clusts <- clusters(delete.edges(nf.msn$graph, edges_to_remove))
names(clusts$membership) <- V(nf.msn$graph)$label
# clustPal <- clusts$csize
# theClusts <- clustPal > 3
# clustOrder <- order(clustPal, decreasing = TRUE)
# clustPal[clustOrder][theClusts[clustOrder]] <- RColorBrewer::brewer.pal(sum(theClusts), "Set1")
# clustPal[clustOrder][!theClusts[clustOrder]] <- gray.colors(sum(!theClusts))
# nodeList <- lapply(1:length(clustPal), function(x) which(clusts$membership == x))
make_node_list <- function(clusts, pal, cutoff = 3){
PAL <- match.fun(pal)
clustPal <- table(clusts$membership)
theClusts <- clustPal > 3
clustOrder <- order(clustPal, decreasing = TRUE)
clustPal[clustOrder][theClusts[clustOrder]] <- PAL(sum(theClusts))
clustPal[clustOrder][!theClusts[clustOrder]] <- gray.colors(sum(!theClusts))
nodeList <- lapply(1:length(clustPal), function(x) which(clusts$membership == x))
names(nodeList) <- clustPal
return(nodeList)
}
clust_cutoff <- function(clusts, cutoff = 3){
table(clusts$membership) > 3
}
nodeList <- make_node_list(clusts,
function(x) RColorBrewer::brewer.pal(x, "Set1"),
cutoff = 3)
theClusts <- clust_cutoff(clusts, 3)
goodSeed <- 6
# goodSeed <- 17
# goodSeed <- 24
thisPal <- function(x) comparePal[nf.msn$populations]
# for (i in goodSeed:100){
# set.seed(i)
set.seed(goodSeed)
MASTER <- get_layout(nf.msn$graph, LAYOUT = layout.fruchterman.reingold)
plot_poppr_msn(for2nur, nf.msn, gad = 10, palette = thisPal, mlg = TRUE,
layfun = MASTER, nodebase = 1.75, vertex.label.font = 2,
quantiles = FALSE, #inds = unique(ramdat@mlg),
vertex.label.color = "firebrick",
mark.groups = nodeList[theClusts],
mark.border = names(nodeList)[theClusts],
mark.col = transp(names(nodeList)[theClusts], 0.05),
mark.expand = 2,
mark.shape = 0)
# prompt <- paste("save", i, "as seed?")
# accept <- readline(prompt)
# if (substr(accept, 1, 1) == "y"){
# theSeed <- i
# break
# }
# }
# mainMLGs <- order(table(ramdat@mlg), decreasing = TRUE)[1:10]
# pdf("msn_nursery.pdf", width = 352/2.54/10, height = 352/2.54/10, pointsize = 40)
# plot_poppr_msn(for2nur, nf.msn, gad = 10, palette = thisPal, mlg = TRUE,
# layfun = MASTER, nodebase = 1.75, vertex.label.font = 2,
# quantiles = FALSE, inds = mainMLGs, nodelab = 1000,
# vertex.label.color = "firebrick",
# mark.groups = nodeList[theClusts],
# mark.border = names(nodeList)[theClusts],
# mark.col = transp(names(nodeList)[theClusts], 0.05),
# mark.expand = 2,
# mark.shape = 0)
# dev.off()
assignTheme <- theme(
panel.grid.major.y = element_line(color = "gray75", linetype = 3),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(color = "gray75"),
panel.grid.minor.x = element_blank(),#element_line(color = "gray75"),
panel.background = element_blank(),
panel.border = element_blank(),
legend.key = element_blank(),
axis.line = element_line(color = "black"),
axis.text = element_text(color = "black"),
axis.ticks = element_line(color = "black")
)
nurlist <- c("Nursery_CA", "Nursery_OR")
z2.dapc <- dapc(popsub(for2nur, blacklist = nurlist, drop = FALSE),
n.pca = 12, n.da = 6)
nur <- popsub(for2nur, nurlist, drop = FALSE)
nurpred <- predict.dapc(z2.dapc, newdata = nur)
plot_posterior(nurpred, nur, pal = comparePal)
ggsave("nursery_predictions_structure.png", width = 183, height = 183,
units = "mm", dpi = 300)
colMeans(nurpred$posterior)
## JHallCr_OR NFChetHigh_OR Coast_OR HunterCr_OR Winchuck_OR
## 0.189817 0.152097 0.596483 0.033956 0.012044
## ChetcoMain_OR PistolRSF_OR
## 0.014192 0.001411
nurpredmat <- matrix(c(
rowMeans(apply(nurpred$posterior, 1, function(x) x >= 0.95)),
rowMeans(apply(nurpred$posterior, 1, function(x) x >= 0.99)),
rowMeans(apply(nurpred$posterior, 1, function(x) x >= 0.999))),
ncol = 3)
dimnames(nurpredmat) <- list(Population = colnames(nurpred$posterior),
`membership probability` = c("95%", "99%", "99.9%"))
signif(nurpredmat, 3)*100
## membership probability
## Population 95% 99% 99.9%
## JHallCr_OR 6.02 0.926 0.000
## NFChetHigh_OR 0.00 0.000 0.000
## Coast_OR 48.10 28.200 0.463
## HunterCr_OR 3.24 3.240 3.240
## Winchuck_OR 0.00 0.000 0.000
## ChetcoMain_OR 1.39 0.926 0.926
## PistolRSF_OR 0.00 0.000 0.000
mean(apply(nurpred$posterior, 1, function(x) all(x <= 0.6)))
## [1] 0.2176
assignmat <- matrix(c(summary(z2.dapc)$assign.per.pop,
summary(for2nur.dapc)$assign.per.pop[-c(1:2)]),
ncol = 2
)
dimnames(assignmat) <- list(Population = colnames(nurpred$posterior),
c("Without Nursery", "With Nursery"))
assignmat
##
## Population Without Nursery With Nursery
## JHallCr_OR 0.9016 0.8689
## NFChetHigh_OR 0.5351 0.5439
## Coast_OR 0.8529 0.6765
## HunterCr_OR 1.0000 0.9848
## Winchuck_OR 0.8857 0.8571
## ChetcoMain_OR 0.6875 0.6875
## PistolRSF_OR 0.0000 0.0000
noseb.gt.10 <- popsub(for2nur, blacklist = c("HunterCr_OR", "PistolRSF_OR"),
drop = FALSE)
set.seed(9001)
system.time(noseb.dapcxval <- xvalDapc(noseb.gt.10@tab, pop(noseb.gt.10),
n.pca = 5:20, result = "overall",
n.rep = 1000))
## NULL
## user system elapsed
## 305.562 2.598 311.087
noseb.dapcxval[-1]
## $`Median and Confidence Interval for Random Chance`
## 2.5% 50% 97.5%
## 0.1177 0.1426 0.1699
##
## $`Mean Successful Assignment by Number of PCs of PCA`
## 5 6 7 8 9 10 11 12 13 14
## 0.6638 0.6934 0.6878 0.6890 0.7073 0.7277 0.7298 0.7274 0.7370 0.7528
## 15 16 17 18 19 20
## 0.7576 0.7621 0.7590 0.7571 0.7587 0.7567
##
## $`Number of PCs Achieving Highest Mean Success`
## [1] "16"
##
## $`Root Mean Squared Error by Number of PCs of PCA`
## 5 6 7 8 9 10 11 12 13 14
## 0.3404 0.3107 0.3165 0.3147 0.2971 0.2772 0.2746 0.2772 0.2679 0.2523
## 15 16 17 18 19 20
## 0.2469 0.2426 0.2460 0.2482 0.2467 0.2483
##
## $`Number of PCs Achieving Lowest MSE`
## [1] "16"
##
## $DAPC
## #################################################
## # Discriminant Analysis of Principal Components #
## #################################################
## class: dapc
## $call: dapc.data.frame(x = as.data.frame(x), grp = ..1, n.pca = ..2,
## n.da = ..3)
##
## $n.pca: 16 first PCs of PCA used
## $n.da: 6 discriminant functions saved
## $var (proportion of conserved variance): 0.982
##
## $eig (eigenvalues): 359.7 122.3 71.24 59.53 31.3 ...
##
## vector length content
## 1 $eig 6 eigenvalues
## 2 $grp 659 prior group assignment
## 3 $prior 7 prior group probabilities
## 4 $assign 659 posterior group assignment
## 5 $pca.cent 38 centring vector of PCA
## 6 $pca.norm 38 scaling vector of PCA
## 7 $pca.eig 32 eigenvalues of PCA
##
## data.frame nrow ncol
## 1 $tab 659 16
## 2 $means 7 16
## 3 $loadings 16 6
## 4 $ind.coord 659 6
## 5 $grp.coord 7 6
## 6 $posterior 659 7
## 7 $pca.loadings 38 16
## 8 $var.contr 38 6
## content
## 1 retained PCs of PCA
## 2 group means
## 3 loadings of variables
## 4 coordinates of individuals (principal components)
## 5 coordinates of groups
## 6 posterior membership probabilities
## 7 PCA loadings of original variables
## 8 contribution of original variables
noseb.dapc <- dapc(noseb.gt.10, n.pca = 16, n.da = 6)
scatter(noseb.dapc, col = comparePal[noseb.gt.10@pop.names], cex = 2,
legend = TRUE, clabel = FALSE,
posi.leg = "bottomleft", scree.pca = TRUE, posi.pca = "topright",
cleg = 0.75, xax = 1, yax = 2, inset.solid = 1)
Inertia ellipses represent 95% data.
sebdat <- popsub(for2nur, c("HunterCr_OR", "PistolRSF_OR"), drop = FALSE)
sebpred <- predict.dapc(noseb.dapc, sebdat)
plot_posterior(sebpred, sebdat, pal = comparePal)
scatter(noseb.dapc, col = comparePal[noseb.gt.10@pop.names],
cex = 2, scree.da = FALSE, cellipse = 2.5,
legend = TRUE, clabel = FALSE,
posi.leg = "bottomright", posi.pca = "topright",
cleg = 0.75, xax = 1, yax = 2, inset.solid = 1)
par(xpd = TRUE)
sebpch <- c(HunterCr_OR = 15, PistolRSF_OR = 17)
points(sebpred$ind.scores[, 1], sebpred$ind.scores[, 2],
pch = sebpch[as.character(pop(sebdat))],
col = transp(comparePal[sebdat@pop.names], 0.2),
cex = 3)
legend("topright", legend = c("HunterCr", "PistolRSF"), pch = sebpch,
col = comparePal[sebdat@pop.names], cex = 0.75)
add.scatter.eig(noseb.dapc$eig, 15, 1, 2, posi="bottomleft", inset=.02)
sebpredmat <- matrix(c(
rowMeans(apply(sebpred$posterior[1:66, ], 1, function(x) x >= 0.5)),
rowMeans(apply(sebpred$posterior[1:66, ], 1, function(x) x >= 0.95)),
rowMeans(apply(sebpred$posterior[1:66, ], 1, function(x) x >= 0.99)),
rowMeans(apply(sebpred$posterior[1:66, ], 1, function(x) x >= 0.999))),
ncol = 4)
dimnames(sebpredmat) <- list(Population = colnames(sebpred$posterior),
`membership probability` = c("50%", "95%", "99%",
"99.9%"))
signif(sebpredmat, 3)
## membership probability
## Population 50% 95% 99% 99.9%
## Nursery_CA 1 0.924 0.924 0.0152
## Nursery_OR 0 0.000 0.000 0.0000
## JHallCr_OR 0 0.000 0.000 0.0000
## NFChetHigh_OR 0 0.000 0.000 0.0000
## Coast_OR 0 0.000 0.000 0.0000
## Winchuck_OR 0 0.000 0.000 0.0000
## ChetcoMain_OR 0 0.000 0.000 0.0000
scatter(z2.dapc, cex = 2, legend = TRUE, clabel = FALSE,
col = comparePal[names(z2.dapc$prior)], scree.pca = FALSE,
posi.leg = "bottomleft", posi.pca = "topleft",
cleg = 0.75, bg="white", scree.da=0, pch=20,
xlim = c(-4, 15), ylim = range(z2.dapc$ind.coord[, 2]) + c(-0.01, 0.01),
solid = 0.5)
par(xpd=TRUE)
nurpch <- c(Nursery_CA = 15, Nursery_OR = 17)
points(nurpred$ind.scores[, 1], nurpred$ind.scores[, 2],
pch = nurpch[as.character(pop(nur))], col = transp("black", 0.2), cex = 2)
legend("topright", legend = c("Nursery (CA)", "Nursery (OR)"), pch = nurpch,
cex = 0.75)
add.scatter.eig(z2.dapc$eig, 15, 1, 2, posi="bottomright", inset=.02)
ggplot(melt(assignmat), aes(y = Population, x = value, shape = Var2)) +
geom_point(size = 4, alpha = 0.75) +
labs(list(y = "Population", x = "rate of successful reassignment",
shape = "",
title = "Posterior values from DAPC with and without Nursery data")
) + assignTheme
ggsave("reassignment_plot.png", width = 183, height = 61, units = "mm",
dpi = 300)
ggplot(melt(nurpredmat),
aes(y = Population, x = value, size = `membership probability`)) +
geom_point(alpha = 0.75) +
assignTheme +
scale_size_discrete(range = c(6, 3)) +
labs(list(x = "Percent successful assignment",
size = "membership\nprobability",
title = "Posterior values for predictions of nursery genotypes"))
ggsave("nursery_predictions.png", width = 183, height = 61, units = "mm",
dpi = 300)
myMLG <- myPal
nurMLG <- char2pal(sort(unique(for2nur@mlg[for2nur@mlg > 70])), grey.colors)
names(nurMLG) <- paste("MLG", names(nurMLG), sep = ".")
myMLG <- c(myMLG, nurMLG)
date()
## [1] "Thu Nov 27 13:45:48 2014"
set.seed(5555)
system.time(try(for2nur.tree <- bruvo.boot(for2nur,
replen = newReps,#other(for2nur)$REPLEN,
tree = "nj", showtree = FALSE,
quiet = TRUE, sample = 100)))
## Warning: Some branch lengths of the tree are negative. Normalizing
## branches according to Kuhner and Felsenstein (1994)
## user system elapsed
## 522.78 4.43 531.39
date()
## [1] "Thu Nov 27 13:54:39 2014"
for2nur.tree$tip.label <- paste("MLG", for2nur@mlg, sep = ".")
for2nur.tree <- ladderize(for2nur.tree)
# pdf("~/Downloads/testree.pdf", width = 12, height = 80)
par(mfrow = c(1, 2))
plot.phylo(for2nur.tree, cex = 0.8, font = 2, adj = 0,
label.offset = 1/800, main = "color by MLG",
tip.col = myMLG[for2nur.tree$tip.label])
nodelabels(ifelse(for2nur.tree$node.label > 50, for2nur.tree$node.label, NA),
adj = c(1.3, -0.5), frame = "n",
cex = 0.8, font = 3, xpd = TRUE)
add.scale.bar(lwd = 5)
plot.phylo(for2nur.tree, cex = 0.8, font = 2, adj = 0,
label.offset = 1/800, main = "color by Population",
tip.col = comparePal[pop(for2nur)])
nodelabels(ifelse(for2nur.tree$node.label > 50, for2nur.tree$node.label, NA),
adj = c(1.3, -0.5), frame = "n",
cex = 0.8, font = 3, xpd = TRUE)
add.scale.bar(lwd = 5)
legend("topright", legend = names(comparePal), fill = comparePal, bty = "n",
border = NULL)
par(mfrow = c(1, 1))
# dev.off()
set.seed(5001)
system.time(mlg.tree <- bruvo.boot(clonecorrect(for2nur, hier = NA),
replen = newReps,#other(for2nur)$REPLEN,
tree = "nj",
sample = 1000,
quiet = TRUE,
showtree = FALSE))
## user system elapsed
## 74.814 0.437 75.609
When Everett asked about the Joe Hall outbreak, he asked if the genotypes that had jumped accross the drainage appeared to be different than the ones on the West side. Looking at this, I saw that genotype 51 had appeared in 2003. This was a genotype from the second largest group in the MSN that has been expanding in current years.
Looking at the genotypes, I noticed that the genotypes from that group had a different set of alleles at PrMS39. Plotting these, we can see that they segregate quite well on the tree (which makes sense as it partially contributes to the structure).
mlg.tree$tip.label <- paste("MLG", clonecorrect(for2nur, hier = NA)@mlg, sep = ".")
mlg.tree <- ladderize(mlg.tree)
gts <- genind2df(clonecorrect(for2nur, hier = NA), sep = "|", usepop = FALSE)[[3]]
gtPal <- char2pal(gts, function(x) RColorBrewer::brewer.pal(x, "Dark2"))
oldTips <- mlg.tree$tip.label
mlg.tree$tip.label <- paste(oldTips, gts)
# png("~/Downloads/newtree.png", width = 10, height = 15, units = "in", res = 300)
par(mfrow = c(1, 2))
plot.phylo(mlg.tree, cex = 0.55, font = 2, adj = 0,
label.offset = 1/800,
tip.col = gtPal[gts], main = "color by locus PrMS39")
nodelabels(ifelse(mlg.tree$node.label > 50, mlg.tree$node.label, NA),
adj = c(1.3, -0.5), frame = "n",
cex = 0.8, font = 3, xpd = TRUE)
add.scale.bar(lwd = 5)
mlg.tree$tip.label <- oldTips
plot.phylo(mlg.tree, cex = 0.8, font = 2, adj = 0,
label.offset = 1/800,
tip.col = names(nodeList)[clusts$membership],
main = "color by MSN cluster\nwith one mutational step")
nodelabels(ifelse(mlg.tree$node.label > 50, mlg.tree$node.label, NA),
adj = c(1.3, -0.5), frame = "n",
cex = 0.8, font = 3, xpd = TRUE)
add.scale.bar(lwd = 5)
par(mfrow = c(1, 1))
# dev.off()
setpop(ramdat) <- ~Pop
# setpop(ramdat) <- ~ZONE2
lociAlleleFreqs <- lapply(seploc(ramdat), function(z){
x <- apply(truenames(z)$tab, 2,
function(e) tapply(e, pop(ramdat), mean, na.rm = TRUE))
names(dimnames(x)) <- c("Year", "Allele")
# names(dimnames(x)) <- c("Region", "Allele")
dimnames(x)[[2]] <- sub("Pr.+?A1.", "", dimnames(x)[[2]])
return(x)
})
ramdatcc <- clonecorrect(ramdat, ~Pop)
lociAlleleFreqscc <- lapply(seploc(ramdatcc), function(z){
x <- apply(truenames(z)$tab, 2,
function(e) tapply(e, pop(ramdatcc), mean, na.rm = TRUE))
names(dimnames(x)) <- c("Year", "Allele")
dimnames(x)[[2]] <- sub("Pr.+?A1.", "", dimnames(x)[[2]])
return(x)
})
bad_years <- as.character(2005:2011)
lociAlleleFreqs <- melt(lociAlleleFreqs) %>% filter(!Year %in% bad_years)
lociAlleleFreqscc <- melt(lociAlleleFreqscc) %>% filter(!Year %in% bad_years)
ggplot(lociAlleleFreqs, aes(x = Year, y = value, color = value,
# lociAlleleFreqs <- melt(lociAlleleFreqs)
# ggplot(lociAlleleFreqs, aes(x = Year, y = value, color = value,
group = Allele)) +
geom_line() +
geom_text(aes(label = Allele), fontface = "bold") +
scale_x_continuous(breaks = as.numeric(ramdat@pop.names[c(1:4, 9:11)])) +
facet_wrap(~L1) +
assignTheme +
labs(list(title = "Allele frequencies in forest data",
y = "allele frequency",
color = "frequency"))
ggplot(filter(lociAlleleFreqs, L1 == "PrMS39A1"),
# aes(x = Region, y = value, color = factor(Allele), group = Allele)) +
aes(x = Year, y = value, color = factor(Allele), group = Allele)) +
# geom_area(aes(fill = factor(Allele)), stat = "identity", alpha = 0.75) +
geom_line(aes(linetype = "one")) +
geom_line(data = filter(lociAlleleFreqscc, L1 == "PrMS39A1"),
aes(linetype = "two")) +
geom_point(size = 2) +
geom_point(size = 2, data = filter(lociAlleleFreqscc, L1 == "PrMS39A1"),
pch = 1) +
# geom_bar(aes(fill = factor(Allele)), position = "fill", stat = "identity") +
scale_x_continuous(breaks = as.numeric(ramdat@pop.names[c(1:4, 9:11)])) +
scale_linetype_manual(breaks = c("one", "two"), values = 1:2,
name = "data",
guide = "legend",
labels = c("Full", "Clone\nCorrected")) +
assignTheme +
labs(list(title = "Allele frequencies in PrMS39",
y = "allele frequency",
color = "allele")) +
scale_color_brewer(type = "div", palette = "Set1")
ggsave(filename = "PrMS39_alleles.png", width = 183, height = 88, units = "mm")
assoc <- matrix(c(for2nur.tree$tip.label, for2nur.tree$tip.label), ncol = 2)
cophyloplot(for2nur.tree, mlg.tree, assoc, use.edge.length = FALSE, space = 1000,
show.tip.label = FALSE,
col = myMLG[assoc[, 1]], lwd = 3, gap = 1)
Since DAPC showed Cape Sebastian isolates clustring together and we see that nursery isolates are clustering around the cape sebastian isolates in the MSN, it would be a good idea to include the Nursery data within the population tree from Nei’s distance.
neiboot <- function(x, sample = 1000, color_by = "black"){
x <- genind2genpop(x, quiet = TRUE)
outTree <- aboot(x, sample = sample, cutoff = 50, quiet = TRUE, tree = "nj",
showtree = FALSE)
plot.phylo(outTree, xpd = TRUE, font = 2, cex = 0.8, type = "u",
label.offset = 0.005, adj = 0, tip.col = color_by)
nodelabels(outTree$node.label, xpd = TRUE, frame = "n", cex = 0.8, font = 3)#,
# adj = c(1.3, -0.5))
# axisPhylo(3)
add.scale.bar(lwd = 5)
return(outTree)
}
set.seed(5555)
source_state <- neiboot(for2nur, sample = 10000,
color_by = comparePal[for2nur@pop.names])
## Warning: Infinite values detected.
set.seed(5555)
source_statecc <- neiboot(clonecorrect(for2nur, ~SOURCE/STATE, combine = TRUE),
sample = 10000,
color_by = comparePal[for2nur@pop.names])
## Warning: Infinite values detected.
set.seed(5555)
source_ <- neiboot(setpop(for2nur, ~SOURCE), sample = 10000,
color_by = comparePal[for2nur@pop.names[-2]])
## Warning: Infinite values detected.
set.seed(5555)
source_cc <- neiboot(clonecorrect(for2nur, ~SOURCE), sample = 10000,
color_by = comparePal[for2nur@pop.names[-2]])
## Warning: Infinite values detected.
add.tip.color <- function(x, tip_colors){
tips <- x$tip.label
colors <- paste0("[&!color=", tip_colors, "]")
x$tip.label <- paste0(tips, colors)
return(x)
}
round.nodes <- function(x){
x$node.label <- round(x$node.label)
return(x)
}
mywrite.nexus <- function(x, tip_colors){
theFile <- paste(substitute(x), "nursery.nex", sep = "_")
x <- round.nodes(add.tip.color(x, tip_colors))
write.nexus(x, file = theFile)
}
mywrite.nexus(source_state, comparePal[for2nur@pop.names])
mywrite.nexus(source_statecc, comparePal[for2nur@pop.names])
mywrite.nexus(source_, comparePal[for2nur@pop.names[-2]])
mywrite.nexus(source_cc, comparePal[for2nur@pop.names[-2]])